#delimit ;

/* This program sets up the MC loop for Table 2.

It takes the following inputs (options of the program):

mcreps 		how many Monte Carlo reps
bsreps		how many bootstrap replications
numstates	how many clusters
sourcedata	a data file, which has the source data
savedata	a data file - the output of the Monte Carlo program gets written to this data file.

*/


cap prog drop do_the_monte_carlo ;
prog def do_the_monte_carlo ;
syntax  , [mcreps(integer 3) bsreps(integer 4) numstates(integer 6) sourcedata(string) savedata(string)] ;

/* get ready to run the monte carlo */

tempfile to_be_sampled main_data bsout pairs_bs_out ;

local mydate = subinstr("$S_DATE"," ","_",.) ;
cap erase "t2_post_`mydate'.dta" ;

local postlist = "b se_rob se_clu se_CR2 se_CR3 dof_CR2_IK dof_CSS se_clu_bs bs_misreps p_pairs_bs p_rad_res p_rad_res_high p_rad_res_low p_rad_unres p_webb_res " ;
cap postclose mcoutput ;
qui postfile mcoutput numobs numstates K numbsreps `postlist' using "t2_post_G`numstates'_`mydate'.dta" , replace ;

di "MC Reps = `mcreps', BS Reps = `bsreps', Number of states sampled = `numstates'" ;
di "Using data from `sourcedata'" ;

local lhs = "lnwage" ;
local key_rhs = "policy" ;
local rhs = "`key_rhs' age age2 yrseduc" ;
local command = "reg `lhs' `rhs'  , cluster(pseudo_cluster) " ;

local beta_hypothesis = 0 ;

tempfile statelist ;
qui use statefip using "`sourcedata'" ;
contract statefip ;
keep statefip ;
sort statefip ;
qui save "`statelist'" ;


/* the main loop */
qui forvalues mc = 1/`mcreps' { ;

noi di "." _continue ;
if (mod(`mc',100) == 1) { ; /* every X reps, report progress */
	noi di " `mc' " _continue ;
} ;

/* make sure we don't post last round's data to this round */
foreach zz in `postlist' { ;
	local `zz' = "." ;
} ;

/* generate the data */

use `statelist' , replace ;
bsample `numstates' ;
/* assign treatment dummy to 1/2 of states (round down) */
gen sortvar = uniform() ;
sort sortvar ;
gen policy = _n / _N <= 0.5 ;
/* generate pseudo-cluster-ids for later */
sort statefip ;
qui by statefip: gen subid = _n ;
egen pseudo_cluster = group(statefip subid) ;
keep statefip subid pseudo_cluster policy ;
sort statefip subid ;
save `to_be_sampled' , replace ;

keep statefip ;
contract statefip ;
sort statefip ;
merge 1:m statefip using "`sourcedata'" , assert(match using) keep(match) nogenerate  ;

gen personid = _n ;
expand _freq ;  /* this was generated by the contract command; tells us how many replicates of each state to make */
sort personid ;
qui by personid: gen subid = _n ;
sort statefip subid ;
merge m:1 statefip subid using `to_be_sampled' , assert(match) keep(match) nogenerate ;


/* estimate the models */

reg lnwage policy age age2 yrseduc  , robust ;
local b = _b[policy] ;
local se_rob = _se[policy] ;
local K = e(rank) ;
local numobs = e(N) ;

tempname betavec v_naive ;
matrix `betavec' = e(b) ;

/* generate residuals.  Also, generate "restricted, impose the null hypothesis" residuals */
predict yhat , xb ;
predict resid , resid ;

replace `lhs' = `lhs' - `beta_hypothesis' * `key_rhs' ;
local shortcommand = subinstr("`command'","`key_rhs'","",.) ;
`shortcommand' ;
predict resid_restricted , residual ;
predict yhat_restricted , xb ;
replace yhat_restricted = yhat_restricted + (`beta_hypothesis' * `key_rhs') ;
replace `lhs' = `lhs' + `beta_hypothesis' * `key_rhs' ;


reg lnwage policy age age2 yrseduc  , cluster(pseudo_cluster) ;
local se_clu = _se[policy] ;

gen one = 1 ;
sort pseudo_cluster ;
gen obsnum = _n ;
save `main_data' , replace ;


/* get the CR2, CR3 standard errors adjustments.  Also get I-K degrees of freedom, and CSS effective number of clusters */
CR23_IK_CSS , betavec(`betavec') lhs(`lhs') rhs(`rhs') key_rhs(`key_rhs') main_data("`main_data'") ;

foreach r in se_CR2 se_CR3 dof_CR2_IK dof_CSS { ;
	local `r' = r(`r') ;
} ;



/* do nonparametric bootstraps: for standard errors */
use `main_data' , replace ;
cap erase `pairs_bs_out' ;

qui bootstrap _b _se , reps(`bsreps') nodots cluster(pseudo_cluster) idcluster(pseudo_cluster_2) saving(`pairs_bs_out' , double)
	: reg lnwage policy age age2 yrseduc  , cluster(pseudo_cluster_2) ;
local se_clu_bs = _se[policy] ;
local bs_misreps = e(N_misreps) ;

use `pairs_bs_out' , replace ;
keep _b_policy _se_policy ;
gen t_pairs_bs = ((_b_policy - `b') / _se_policy) ;
keep t_pairs_bs ;

summ ;
save `pairs_bs_out' , replace ;

/* Do percentile-T bootstraps */
local main_t = (`b' - `beta_hypothesis') / `se_clu' ;

cap postclose bs_output ;
cap erase `bsout' ;
postfile bs_output t_rad_res t_rad_unres t_webb_res using `bsout' ;

qui forvalues bb = 1/`bsreps' { ;

	/* for the wild bootstrap */
	/* take the cluster list, generate 3 sets of residual transformations */
	/* then merge these back onto main dataset, created transformed residuals and then transformed y-hats */
	/* then estimate the models, and save the t-statistics */

	use pseudo_cluster using `to_be_sampled' , replace ;
	gen my_uniform = uniform() ;
	gen wild_rademacher = -1 + 2 * (my_uniform >= 0.5) ;
	gen wild_webb = 	(-1) * sqrt(1.5) * (my_uniform > (0) & my_uniform <= (1/6)) +  
						(-1) * sqrt(1) * (my_uniform > (1/6) & my_uniform <= (2/6))  + 
						(-1) * sqrt(0.5) * (my_uniform > (2/6) & my_uniform <= (3/6)) + 
						(+1) * sqrt(0.5) * (my_uniform > (3/6) & my_uniform <= (4/6)) + 
						(+1) * sqrt(1) * (my_uniform > (4/6) & my_uniform <= (5/6))  + 
						(+1) * sqrt(1.5) * (my_uniform > (5/6) & my_uniform <= (6/6)) ; 
	
	keep pseudo_cluster wild_rademacher wild_webb ;
	sort pseudo_cluster ;
	merge 1:m pseudo_cluster using `main_data' , assert(match) keep(match) nogenerate ;	
	
	/* create transformed residuals and new wild-outcome-variables */
	gen resid_wild_rad_restricted = resid_restricted * wild_rademacher ;
	gen resid_wild_rad_unrestricted = resid * wild_rademacher ;
	gen resid_wild_webb_restricted = resid_restricted * wild_webb ;
	
	gen y_wild_rademacher_restricted = yhat_restricted + resid_wild_rad_restricted ;
	gen y_wild_rademacher_unrestricted = yhat + resid_wild_rad_unrestricted ;
	gen y_wild_webb_restricted = yhat_restricted + resid_wild_webb_restricted ;

	/* now estimate cluster-robust models on each of these three, generating t-statistics.
		For the restricted model, the t-stat is based on the null hypothesis.  for the unrestricted
		model the t-stat is based on the main (first) estiamted beta */
	
	local shortcommand = subinstr("`command'","`lhs'","y_wild_rademacher_restricted",.) ;
	`shortcommand' ;
	local b_wild_rademacher_restricted = _b[`key_rhs'] ;
	local se_wild_rademacher_restricted = _se[`key_rhs'] ;

	local shortcommand = subinstr("`command'","`lhs'","y_wild_rademacher_unrestricted",.) ;
	`shortcommand' ;
	local b_wild_rademacher_unrestricted = _b[`key_rhs'] ;
	local se_wild_rademacher_unrestricted = _se[`key_rhs'] ;

	local shortcommand = subinstr("`command'","`lhs'","y_wild_webb_restricted",.) ;
	`shortcommand' ;
	local b_wild_webb_restricted = _b[`key_rhs'] ;
	local se_wild_webb_restricted = _se[`key_rhs'] ;

	/* make the t-stats ; store away into a postfile */

	local t_wild_rademacher_restricted  = (`b_wild_rademacher_restricted ' - `beta_hypothesis') 
		/ `se_wild_rademacher_restricted' ;
	local t_wild_rademacher_unrestricted  = (`b_wild_rademacher_unrestricted ' - `b') 
		/ `se_wild_rademacher_unrestricted' ;
	local t_wild_webb_restricted  = (`b_wild_webb_restricted ' - `beta_hypothesis') 
		/ `se_wild_webb_restricted' ;

	post bs_output (`t_wild_rademacher_restricted') (`t_wild_rademacher_unrestricted') (`t_wild_webb_restricted') ;
	
} ;

postclose bs_output ;

/* identify percentiles in the bootstrap distribution, to get p-values and/or rejection rates */

/* to get these percentiles, we will make lists of t-stats, with the "main" t-stat stuck into this list
we wante to know what the p-value of this main t-stat is.  Because with wild, few clusters, we can get intervals,
we will use mean-p-value [is this what we want?] of the interval.   */

drop _all ;
use `bsout' ;
merge 1:1 _n using `pairs_bs_out' , nogenerate ;
summ ;
save `bsout' , replace ;

drop _all ;
set obs 1 ;
gen t_rad_res = `main_t' ;
gen t_rad_unres = `main_t' ;
gen t_webb_res = `main_t' ;
gen t_pairs_bs = `main_t' ;

//summ ;
append using `bsout' ;
gen rank = . ;
//summ ;

/*
//noi di "main_t = `main_t'" ;
//noi tab t_rad_res ;
//noi tab t_rad_unres ;
//noi tab t_webb_res ;
*/

foreach var in pairs_bs rad_res rad_unres webb_res { ;

	sort t_`var' ;
	replace rank = _n ;
	summ t_`var' ;
	local maxrank = r(N) ;    /* should be #bs reps; but this allows for missing bs values */
	summ rank if abs(t_`var' - `main_t') < 0.0001 ; /* allow for machine error in t-stat computations in bootstrap step */
	local meanrank = r(mean) ;
	local toprank = r(max) ;
	local botrank = r(min) ;
	local pctile = `meanrank' / `maxrank' ;
	local myp = 2 * min(`pctile' , (1-`pctile')) ;
	local p_`var' = `myp' ;
	
	/* for two point wild bootstraps with few clusters, have interval-p-value problem.  This gets
			at ends of the interval */
	local pctile_top = `toprank' / `maxrank' ;
	local myp1 = 2 * min(`pctile_top' , (1-`pctile_top')) ;
	local pctile_bot = `botrank' / `maxrank' ;
	local myp2 = 2 * min(`pctile_bot' , (1-`pctile_bot')) ;

	local p_`var'_high = max(`myp1',`myp2') ;
	local p_`var'_low = min(`myp1',`myp2') ;
	
} ;

/*
//foreach zz in `postlist' { ;
//	noi di "`zz' = ``zz'' " ;
//} ;
*/

/* save the results */

post mcoutput (`numobs') (`numstates') (`K') (`bsreps') (`b') (`se_rob') (`se_clu') (`se_CR2') (`se_CR3') (`dof_CR2_IK') (`dof_CSS') (`se_clu_bs') (`bs_misreps')
	(`p_pairs_bs') (`p_rad_res') (`p_rad_res_high') (`p_rad_res_low') (`p_rad_unres') (`p_webb_res') ;

} ;


/* now, take the whole package of results and save them */
postclose mcoutput ;

use "t2_post_G`numstates'_`mydate'.dta" , replace ;
qui save `savedata' , replace ;

di ;


end ;

